{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://malaya-dataset.s3-ap-southeast-1.amazonaws.com/crawler/academia/academia-pdf.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import cleaning\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "224"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the JSON dump as UTF-8 explicitly instead of relying on the platform's default encoding.\n",
    "with open('academia-pdf.json', encoding='utf-8') as fopen:\n",
    "    pdf = json.load(fopen)\n",
    "    \n",
    "# Number of top-level entries loaded from the dump.\n",
    "len(pdf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unidecode import unidecode\n",
    "\n",
    "def clean(string):\n",
    "    \"\"\"Pass every line through `cleaning.cleaning` and drop unwanted lines.\n",
    "\n",
    "    `string` is an iterable of text lines (despite its name). A line is\n",
    "    discarded when, case-insensitively, it contains 'tarikh', 'soalan no',\n",
    "    '.soalan' or 'jum ', when its first whitespace-separated token is all\n",
    "    digits, or when its leading or trailing (up to) three characters are\n",
    "    all digits. Surviving lines are returned stripped, in input order.\n",
    "    \"\"\"\n",
    "    kept = []\n",
    "    for raw in string:\n",
    "        cleaned = cleaning.cleaning(raw)\n",
    "\n",
    "        # First filter looks at the cleaned text before it is stripped.\n",
    "        lowered = cleaned.lower()\n",
    "        if 'tarikh' in lowered or 'soalan no' in lowered:\n",
    "            continue\n",
    "\n",
    "        # Every remaining check runs on the stripped line, so whitespace at\n",
    "        # the very end cannot contribute to the 'jum ' match.\n",
    "        line = cleaned.strip()\n",
    "        lowered = line.lower()\n",
    "        tokens = line.split()\n",
    "        if tokens and tokens[0].isdigit():\n",
    "            continue\n",
    "        if '.soalan' in lowered or 'jum ' in lowered:\n",
    "            continue\n",
    "        if line[:3].isdigit() or line[-3:].isdigit():\n",
    "            continue\n",
    "\n",
    "        kept.append(line)\n",
    "    return kept"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 224/224 [00:31<00:00,  7.15it/s]\n"
     ]
    }
   ],
   "source": [
    "# Flat list of kept lines; an empty string marks the end of each group.\n",
    "outer = []\n",
    "\n",
    "for document in tqdm(pdf):\n",
    "    lines = clean(document['content']['content'].split('\\n'))\n",
    "    paragraph, last_seen = [], 0\n",
    "\n",
    "    for position, line in enumerate(lines):\n",
    "        if len(line) > 5:\n",
    "            # Single-token lines are not stored, but they still reset the gap counter.\n",
    "            if len(line.split()) > 1:\n",
    "                paragraph.append(line)\n",
    "            last_seen = position\n",
    "        elif not paragraph:\n",
    "            # Nothing collected yet: keep moving the reference point forward.\n",
    "            last_seen = position\n",
    "        elif position - last_seen > 2:\n",
    "            # More than two short lines in a row close the current group.\n",
    "            paragraph.append('')\n",
    "            outer.extend(paragraph)\n",
    "            paragraph = []\n",
    "            last_seen = position\n",
    "\n",
    "    # Flush whatever is still pending at the end of the document.\n",
    "    if paragraph:\n",
    "        paragraph.append('')\n",
    "        outer.extend(paragraph)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "651957"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(outer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 462 ms, sys: 216 ms, total: 678 ms\n",
      "Wall time: 772 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "temp_vocab = list(set(cleaning.multiprocessing(outer, cleaning.unique_words)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "881\n",
      "CPU times: user 104 ms, sys: 111 ms, total: 215 ms\n",
      "Wall time: 655 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# important\n",
    "temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.duplicate_dots_marks_exclamations, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 651957/651957 [00:02<00:00, 282625.60it/s]\n"
     ]
    }
   ],
   "source": [
    "outer = cleaning.string_dict_cleaning(outer, temp_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "71\n",
      "CPU times: user 113 ms, sys: 117 ms, total: 230 ms\n",
      "Wall time: 356 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# important\n",
    "temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.remove_underscore, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 651957/651957 [00:02<00:00, 281336.39it/s]\n"
     ]
    }
   ],
   "source": [
    "outer = cleaning.string_dict_cleaning(outer, temp_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "CPU times: user 223 ms, sys: 129 ms, total: 351 ms\n",
      "Wall time: 928 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# important\n",
    "temp_dict = cleaning.multiprocessing(outer, cleaning.isolate_spamchars, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2631\n",
      "CPU times: user 112 ms, sys: 108 ms, total: 220 ms\n",
      "Wall time: 258 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.break_short_words, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 651957/651957 [00:02<00:00, 273863.82it/s]\n"
     ]
    }
   ],
   "source": [
    "outer = cleaning.string_dict_cleaning(outer, temp_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "765\n",
      "CPU times: user 90.7 ms, sys: 116 ms, total: 207 ms\n",
      "Wall time: 261 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.break_long_words, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 651957/651957 [00:02<00:00, 284637.14it/s]\n"
     ]
    }
   ],
   "source": [
    "outer = cleaning.string_dict_cleaning(outer, temp_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\n",
      "CPU times: user 104 ms, sys: 104 ms, total: 208 ms\n",
      "Wall time: 323 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.remove_ending_underscore, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 651957/651957 [00:02<00:00, 271179.24it/s]\n"
     ]
    }
   ],
   "source": [
    "outer = cleaning.string_dict_cleaning(outer, temp_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "56\n",
      "CPU times: user 98.9 ms, sys: 107 ms, total: 206 ms\n",
      "Wall time: 323 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.remove_starting_underscore, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 651957/651957 [00:02<00:00, 279244.09it/s]\n"
     ]
    }
   ],
   "source": [
    "outer = cleaning.string_dict_cleaning(outer, temp_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "138699\n",
      "CPU times: user 510 ms, sys: 124 ms, total: 634 ms\n",
      "Wall time: 723 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.end_punct, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 651957/651957 [00:02<00:00, 251827.17it/s]\n"
     ]
    }
   ],
   "source": [
    "outer = cleaning.string_dict_cleaning(outer, temp_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "37464\n",
      "CPU times: user 211 ms, sys: 152 ms, total: 363 ms\n",
      "Wall time: 380 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.start_punct, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 651957/651957 [00:02<00:00, 242230.75it/s]\n"
     ]
    }
   ],
   "source": [
    "outer = cleaning.string_dict_cleaning(outer, temp_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "65\n",
      "CPU times: user 89.4 ms, sys: 144 ms, total: 233 ms\n",
      "Wall time: 285 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.join_dashes, list_mode = False)\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 651957/651957 [00:02<00:00, 256821.01it/s]\n"
     ]
    }
   ],
   "source": [
    "outer = cleaning.string_dict_cleaning(outer, temp_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Indonesia : Dar Ihya al kutub , h . 140-141 jw 1,.*Tim Penyusun ( 2003 ), Ensiklopedi Islam untuk pelajar . Kuala',\n",
       " 'Lumpur : Era Visi Publication , 2003 h.75 .*\\'\" Tahir Mahmood ( 1993 ), Human Rights in Islamic Law.New',\n",
       " \"Delhi : Institute of Objektive Studies , h , 4-11.Abul ' ala al -\",\n",
       " 'Mahududi ( 1985 ), Hak asasi manusia dalam Islam . Bandung :',\n",
       " \"Pustaka h . l9-20 .*- Abdul Wahab Khalaf ( 2003 ), Siasah Syari'ah dalam\",\n",
       " 'pemerintahan Islan . Kuala Lumpur : al-Hidayah',\n",
       " \"*'Republika , 07 Jmuai 2009 ** Tim Penlusun . ( 1999 ) Ensiklopedi Hukum Islam . Jakarta :\",\n",
       " '\\' o\"iii Al-Imam Muhammad Ab,t Zahrah ( 1424 . H ) usut Fikh .',\n",
       " '* r Ibn Katsir ( 2004 ) Tafsir lbn Katsir . Fiiyadh : Dr Salm , jilid 2 ,',\n",
       " 'h . 835n Hu*kr , (1990 ) Tafsir al-Azhar , jilid 6 . Singapura : Pustaka',\n",
       " 'n Ibn Hajr a1-Asqhalani ( 2004 ) Bulughul Marom . fuyadh : Dar us -',\n",
       " \"'riiiM . ali Ash-Shabuni ( 1994 ) Tafsir Ayat-Ayat Hukum Dalam al -\",\n",
       " \"Qur'an (terj ). Bandung : aLma'ari { h . 290il ' Fat*u Mufti Kerajam Bmei Darussalam . ( 2007 ) Isu-isu\",\n",
       " \".r ' Sayyid Qutb ( 2000 ) Tafsir Fi Zilalit Qura'n . Kuala Lumpur :\",\n",
       " 'Pustaka Aman Press , h . 162 .',\n",
       " '\" M Muhammad Nashiruddin al-Bani ( 2005 ) Ensiklopedi Fatwa',\n",
       " 'Syeikh al-Bani , penlusm Mahmudz Ahmad Rasyid , terj . . Jakarta :',\n",
       " 'jilid 13',\n",
       " \"du ' Diriwayatkan oleh Muslim . Lihat Imm Muhamad Bin al -\",\n",
       " 'Kahlani ( 1182 H ). Bandung : Dahbn,h . 72 * Muasassah al-Kitab as-Sittah.(2000 ) Riyadh : Dar as-Salam , h .',\n",
       " \"Klaten : Wafa '\",\n",
       " 'r Sayyid Sabiq ( 1978 ) ibid ., h . 1 l9',\n",
       " 'ri Yusuf Qaradhawi ( 2007 ) Halal dan Haram . Jakarta : Rabbani',\n",
       " \"Press , terj . Abu Sa'id al-Falahi . h 17-18\",\n",
       " 'fi Lolor * Ab Rahman (2009 ) 1su Halal Tiada Penghujung.K:uala',\n",
       " 'Lumpu : JAKIM , h.33ru Persatuan Penggguqp Pulau Pinang ( 2002 ) Makanan',\n",
       " \"Berbahaya . Pulau Pinang : Juta Print.I ' Panduar Persatuan Pengguna Pulau Pinang ( 2006 ) Halal\",\n",
       " 'Haram - Ptlat Pinang : Jutaprint , h . 78',\n",
       " \"'' i Subhi al-Saleh (1997 ) Politik dan Pentadbiran Dalam Islam .\",\n",
       " 'Kuala Lumpur : Jakim , (teri .).',\n",
       " 'l\"iji Ibnu Taimiyal ( 2005 ) Siasah Syar\\'iyyah Etika Potitik Islam .',\n",
       " 'Suabaya : fusalah Gusti , (terj .) h . 157k Muhamad Abdul Rau { 1991 , tlmmah the Muslim Nation .',\n",
       " 'KualaLumpur : DBP,h . 159 160 .',\n",
       " 'o Farooq Hassan , i981 , The Concept of State and Law in Islam -',\n",
       " 'England : University Press ofUSA , h . 32 .',\n",
       " 'tu H . Zainal Abidin Ahmad , 1974 , Negara Adil Makmur Menurut',\n",
       " 'Ibnu Sina Teori Kenegaraan dari Filosof dan Doktor Islan',\n",
       " '',\n",
       " 'bi ( 11 .), Al ashbahu wan',\n",
       " 'mdung : Abas h . 97-103.',\n",
       " 'alwj Kamus al Marbawi .',\n",
       " '4l jwl .',\n",
       " 'Islam untuk pelajar . Kluala',\n",
       " 'lights in hlamic Lalrr . New',\n",
       " \"es . h , 4-11.Abul ' ala al -\",\n",
       " 'b dalam Islam . Bandung :',\n",
       " \"Siasah Syari'ah dalam\",\n",
       " 'edi Hultam Islam . Jokarta :',\n",
       " 'rah ( 1424 . H ) Usul Fikh .',\n",
       " 'Riyadh : Dar Salam , jilid 2 ,',\n",
       " 'ilid 6 . Singapura : Pustaka',\n",
       " 'lul Marom . Riyadh : Dar us -',\n",
       " \"l1'at-Ayat Hukum Dalam al -\",\n",
       " 'nussalam . ( 2007 ) Isu-isu',\n",
       " \"lil Qura'n . Kuala Lumpur :\",\n",
       " '( 2005 ) Ensiklopedi Fatwa',\n",
       " 'hmad Rasyid , terj . . Jakarta :',\n",
       " 'Imam Muharnmad Bin al -',\n",
       " ')) Riyadh : Dar as-Salam , h .',\n",
       " 't Htram . Jakafia : Rabbani',\n",
       " 'il Tiad a P enghuj u n g . Kula',\n",
       " \"' inang ( 2002 ) Makanan\",\n",
       " 'lau Pinmg ( 2006 ) Halal',\n",
       " 'Pmtadbiran Dalam Islam .',\n",
       " ': ityah Etika Politik Islam .',\n",
       " 'mmah the Muslim Nation ,',\n",
       " 'of State and Law in Islam ,',\n",
       " '7oa Adil Mabnur Menurut',\n",
       " 'Filosof dan Doktor Islam',\n",
       " '\\\\ friacli Bin Sanusi',\n",
       " ': r -1-i \\\\ llh',\n",
       " \"N'lcrrbonskar Rahsia Pendidikan Islam Afriadi Bir Sanusi\",\n",
       " 'Kaliber Internasional , Jakarta : Pener Bintang , h . 278 -',\n",
       " 'http : / / w.halalmui. org/index.php?option:com content&viera',\n",
       " 'rticle&id - 1 7 5 &Itemid =',\n",
       " 'Lxiii Rahayu Hartini . Kedudukan Fatwa MUI Mengenai',\n",
       " 'Penyelesaian Sengketa Melalui Basyamas Pasca Lahimya UU No',\n",
       " 'Penelitian Universitas Muhammadiyah Malang , June 2007 .',\n",
       " 'nu Hasnan Kasan ( 2008 ) Institusi Fatwa di Malaysia . Bangi :',\n",
       " 'UKM , h 61',\n",
       " 'hi Dil-tip dmi buku pandum Sijil Halal Majlis Ulama Indonesia',\n",
       " 'M go.id / per atuanl2009 tJrJYo20 1 8.pdf',\n",
       " 'kuilhttp : / / elmu.umm.ac.idlfi le.php / I / produkliukumitIiJAJU%206 -',\n",
       " 'dan yang tidak halal .',\n",
       " ': / / docs. google . com . / viewer?u1:h@: / / w.dpr. go . idluu / uu 1 9',\n",
       " '96NU_1996 t.pdf',\n",
       " 'M htp : / / ngada.or gl pp22 - 19 83.htmM . / viewer?ur1:',\n",
       " 'indonesia . or.id / enm / images / dokumen { KADIN',\n",
       " 'kil http : / / ditl ennak . go.id/publikasi/sosialisai 6.pdf',\n",
       " \"Mu 999 / uu-8- 1 999.pdfk ' ,DokLunenUU.php / 148.pdf\",\n",
       " \"http : / / sj sn . menkokesra . go . id / dokum enl p eru'u I 1 9 9 2 I w23 19 9 2 ind\",\n",
       " 'kili http : / / gov.my / web / guesL4ranya-j akim-dan-main -',\n",
       " 'keluarkan-sij il halal di masa depan',\n",
       " 'r*\"iii 1 0 / 04 / syarikalpengeluar-sijil-',\n",
       " 'h @ : / / w.bhmian. com.my / bhrian / articles / Jakimbadantunggalpe',\n",
       " 'nsij ilanhalalmtukelakkekeliruan- /',\n",
       " \"b:Utusan Malaysia&sec :*' gov.my / apa-itu-fatwa\",\n",
       " \"M'Hasnah Hasan , op.cit\",\n",
       " 'wn http : I h adio24 . com.my / player / player.html',\n",
       " '']"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outer[-100:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write as UTF-8 explicitly so the dump does not depend on the platform's default encoding\n",
    "# (a non-UTF-8 locale could otherwise raise UnicodeEncodeError on non-ASCII text).\n",
    "with open('dumping-pdf.txt', 'w', encoding='utf-8') as fopen:\n",
    "    fopen.write('\\n'.join(outer))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
